# Silence all warnings globally: any negative `warn` value means warnings
# are ignored (-2 behaves the same as -1). Deliberate for a clean report,
# but it also hides genuinely useful warnings.
options(warn=-2)
library(pacman)
p_load(dplyr)
# import() helper used to source the project libraries below.
source('../lib/import.R')
#
# Library of common functions developed as part of this TP (course project).
import('../lib/common-lib.R')
## [1] "-> './reflection.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './csv.R' script loadded successfuly!"
## [1] "-> './plot.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './importance.R' script loadded successfuly!"
## [1] "-> './correlation.R' script loadded successfuly!"
## [1] "-> './pca.R' script loadded successfuly!"
## [1] "-> './set_split.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './models.R' script loadded successfuly!"
## ---
## biotools version 4.1
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './test.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './balance.R' script loadded successfuly!"
## [1] "-> '../lib/common-lib.R' script loadded successfuly!"
#
# Helper functions specific to this TP.
import('../scripts/helper_functions.R')
## [1] "-> '../lib/pca.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> '../lib/importance.R' script loadded successfuly!"
## [1] "-> '../scripts/helper_functions.R' script loadded successfuly!"
# Fix the RNG seed so the shuffles, splits and model fits below are
# reproducible.
set.seed(1)

1. Cargamos el dataset

1.1. Cargamos el dataset nasa-asteroids y excluimos observaciones con valores faltantes.

# Load the raw NASA asteroids dataset and inspect its structure.
dataset <- loadcsv('../datasets/nasa.csv')
str(dataset)
## 'data.frame':    4687 obs. of  40 variables:
##  $ Neo.Reference.ID            : int  3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
##  $ Name                        : int  3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
##  $ Absolute.Magnitude          : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.KM.min.          : num  0.1272 0.1461 0.2315 0.0088 0.1272 ...
##  $ Est.Dia.in.KM.max.          : num  0.2845 0.3266 0.5177 0.0197 0.2845 ...
##  $ Est.Dia.in.M.min.           : num  127.2 146.1 231.5 8.8 127.2 ...
##  $ Est.Dia.in.M.max.           : num  284.5 326.6 517.7 19.7 284.5 ...
##  $ Est.Dia.in.Miles.min.       : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Est.Dia.in.Miles.max.       : num  0.1768 0.203 0.3217 0.0122 0.1768 ...
##  $ Est.Dia.in.Feet.min.        : num  417.4 479.2 759.5 28.9 417.4 ...
##  $ Est.Dia.in.Feet.max.        : num  933.3 1071.6 1698.3 64.6 933.3 ...
##  $ Close.Approach.Date         : chr  "1995-01-01" "1995-01-01" "1995-01-08" "1995-01-15" ...
##  $ Epoch.Date.Close.Approach   : num  7.89e+11 7.89e+11 7.90e+11 7.90e+11 7.90e+11 ...
##  $ Relative.Velocity.km.per.sec: num  6.12 18.11 7.59 11.17 9.84 ...
##  $ Relative.Velocity.km.per.hr : num  22017 65210 27327 40226 35427 ...
##  $ Miles.per.hour              : num  13681 40519 16980 24995 22013 ...
##  $ Miss.Dist..Astronomical.    : num  0.419 0.383 0.051 0.285 0.408 ...
##  $ Miss.Dist..lunar.           : num  163.2 149 19.8 111 158.6 ...
##  $ Miss.Dist..kilometers.      : num  62753692 57298148 7622912 42683616 61010824 ...
##  $ Miss.Dist..miles.           : num  38993336 35603420 4736658 26522368 37910368 ...
##  $ Orbiting.Body               : chr  "Earth" "Earth" "Earth" "Earth" ...
##  $ Orbit.ID                    : int  17 21 22 7 25 40 43 22 100 30 ...
##  $ Orbit.Determination.Date    : chr  "2017-04-06 08:36:37" "2017-04-06 08:32:49" "2017-04-06 09:20:19" "2017-04-06 09:15:49" ...
##  $ Orbit.Uncertainity          : int  5 3 0 6 1 1 1 0 0 0 ...
##  $ Minimum.Orbit.Intersection  : num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Jupiter.Tisserand.Invariant : num  4.63 5.46 4.56 5.09 5.15 ...
##  $ Epoch.Osculation            : num  2458000 2458000 2458000 2458000 2458000 ...
##  $ Eccentricity                : num  0.426 0.352 0.348 0.217 0.21 ...
##  $ Semi.Major.Axis             : num  1.41 1.11 1.46 1.26 1.23 ...
##  $ Inclination                 : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Asc.Node.Longitude          : num  314.4 136.7 259.5 57.2 84.6 ...
##  $ Orbital.Period              : num  610 426 644 514 496 ...
##  $ Perihelion.Distance         : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Perihelion.Arg              : num  57.3 313.1 248.4 18.7 158.3 ...
##  $ Aphelion.Dist               : num  2.01 1.5 1.97 1.53 1.48 ...
##  $ Perihelion.Time             : num  2458162 2457795 2458120 2457902 2457814 ...
##  $ Mean.Anomaly                : num  264.8 173.7 292.9 68.7 135.1 ...
##  $ Mean.Motion                 : num  0.591 0.845 0.559 0.7 0.726 ...
##  $ Equinox                     : chr  "J2000" "J2000" "J2000" "J2000" ...
##  $ Hazardous                   : chr  "True" "False" "True" "False" ...

1.2. Las siguientes columnas las excluimos ya sea porque no son numéricas o porque son colineales con las columnas que sí son parte de nuestro análisis.

# Columns excluded from the analysis: identifiers, date/text fields, and
# the estimated-diameter columns that duplicate Est.Dia.in.Miles.* in
# other units (perfectly collinear with the kept columns).
excluded_columns <- c(
  'Neo.Reference.ID',
  'Name',
  'Close.Approach.Date',
  'Epoch.Date.Close.Approach',
  'Orbit.ID',
  'Orbit.Determination.Date',
  'Orbiting.Body',
  'Est.Dia.in.Feet.min.',
  'Est.Dia.in.Feet.max.',
  'Est.Dia.in.M.min.',
  'Est.Dia.in.M.max.',
  'Est.Dia.in.KM.min.',
  'Est.Dia.in.KM.max.',
  'Equinox',
  'Orbit.Uncertainity'
)

# Drop the excluded columns and discard rows with missing values.
# all_of() makes the external-vector selection explicit and silences the
# tidyselect ambiguity note the bare vector used to trigger.
ds_step_1 <- dataset %>% dplyr::select(-all_of(excluded_columns)) %>% na.omit
rm(dataset)

str(ds_step_1)
## 'data.frame':    4687 obs. of  25 variables:
##  $ Absolute.Magnitude          : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.Miles.min.       : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Est.Dia.in.Miles.max.       : num  0.1768 0.203 0.3217 0.0122 0.1768 ...
##  $ Relative.Velocity.km.per.sec: num  6.12 18.11 7.59 11.17 9.84 ...
##  $ Relative.Velocity.km.per.hr : num  22017 65210 27327 40226 35427 ...
##  $ Miles.per.hour              : num  13681 40519 16980 24995 22013 ...
##  $ Miss.Dist..Astronomical.    : num  0.419 0.383 0.051 0.285 0.408 ...
##  $ Miss.Dist..lunar.           : num  163.2 149 19.8 111 158.6 ...
##  $ Miss.Dist..kilometers.      : num  62753692 57298148 7622912 42683616 61010824 ...
##  $ Miss.Dist..miles.           : num  38993336 35603420 4736658 26522368 37910368 ...
##  $ Minimum.Orbit.Intersection  : num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Jupiter.Tisserand.Invariant : num  4.63 5.46 4.56 5.09 5.15 ...
##  $ Epoch.Osculation            : num  2458000 2458000 2458000 2458000 2458000 ...
##  $ Eccentricity                : num  0.426 0.352 0.348 0.217 0.21 ...
##  $ Semi.Major.Axis             : num  1.41 1.11 1.46 1.26 1.23 ...
##  $ Inclination                 : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Asc.Node.Longitude          : num  314.4 136.7 259.5 57.2 84.6 ...
##  $ Orbital.Period              : num  610 426 644 514 496 ...
##  $ Perihelion.Distance         : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Perihelion.Arg              : num  57.3 313.1 248.4 18.7 158.3 ...
##  $ Aphelion.Dist               : num  2.01 1.5 1.97 1.53 1.48 ...
##  $ Perihelion.Time             : num  2458162 2457795 2458120 2457902 2457814 ...
##  $ Mean.Anomaly                : num  264.8 173.7 292.9 68.7 135.1 ...
##  $ Mean.Motion                 : num  0.591 0.845 0.559 0.7 0.726 ...
##  $ Hazardous                   : chr  "True" "False" "True" "False" ...

1.3. Tomamos una muestra del 80% del dataset.

La idea de este paso es evitar tener los mismos resultados que otros análisis pre-existentes en Kaggle. Para esto ordenamos aleatoriamente las observaciones con el parámetro shuffle y luego tomamos una muestra del 80% de las observaciones.

# Keep a random 80% sample of the observations (shuffled first) so the
# analysis does not mirror pre-existing Kaggle kernels; the remaining 20%
# is intentionally discarded.
# `discarded` replaces the original variable name `any`, which shadowed
# base::any().
c(ds_step_2, discarded) %<-% train_test_split(
  ds_step_1, 
  train_size = .8, 
  shuffle    = TRUE
)
## [1] "Train Set size: 3749"
## [1] "Test set size: 938"
rm(discarded)
rm(ds_step_1)

2. Eliminamos las columnas que están altamente correlacionadas

Excluimos las columnas que tienen una correlación mayor al 90% (cutoff = 0.9). De cada grupo altamente correlacionado nos quedamos con una sola variable ya que todas son muy similares.

# Flag columns whose pairwise correlation exceeds the 0.9 cutoff. From
# each highly correlated group only one representative is kept.
high_correlated_columns <- find_high_correlated_columns(
  feat(ds_step_2), 
  cutoff=.9
)
## Compare row 21  and column  15 with corr  0.975 
##   Means:  0.283 vs 0.209 so flagging column 21 
## Compare row 8  and column  9 with corr  1 
##   Means:  0.286 vs 0.202 so flagging column 8 
## Compare row 9  and column  7 with corr  1 
##   Means:  0.252 vs 0.195 so flagging column 9 
## Compare row 7  and column  10 with corr  1 
##   Means:  0.215 vs 0.191 so flagging column 7 
## Compare row 15  and column  12 with corr  0.93 
##   Means:  0.26 vs 0.186 so flagging column 15 
## Compare row 12  and column  24 with corr  0.993 
##   Means:  0.23 vs 0.18 so flagging column 12 
## Compare row 4  and column  5 with corr  1 
##   Means:  0.282 vs 0.171 so flagging column 4 
## Compare row 5  and column  6 with corr  1 
##   Means:  0.237 vs 0.159 so flagging column 5 
## Compare row 3  and column  2 with corr  1 
##   Means:  0.208 vs 0.15 so flagging column 3 
## Compare row 22  and column  13 with corr  0.978 
##   Means:  0.128 vs 0.148 so flagging column 13 
## All correlations <= 0.9
# Drop every flagged column except the first. all_of() makes the
# external-vector selection explicit for tidyselect (it accepts both
# names and positions).
ds_step_3 <- ds_step_2 %>% dplyr::select(-all_of(high_correlated_columns[-1]))

rm(ds_step_2)
str(ds_step_3)
## 'data.frame':    3749 obs. of  16 variables:
##  $ Absolute.Magnitude        : num  25.8 17.7 23.3 19.5 28.1 ...
##  $ Est.Dia.in.Miles.min.     : num  0.01143 0.47633 0.03562 0.20792 0.00396 ...
##  $ Miles.per.hour            : num  48093 28253 38598 41366 9717 ...
##  $ Miss.Dist..miles.         : num  42744884 26690324 21643130 44921436 590913 ...
##  $ Minimum.Orbit.Intersection: num  0.01548 0.01962 0.00237 0.04906 0.00149 ...
##  $ Eccentricity              : num  0.24 0.578 0.447 0.464 0.147 ...
##  $ Inclination               : num  0.901 7.789 17.914 32.976 0.855 ...
##  $ Asc.Node.Longitude        : num  95.2 110.8 123 253.6 321.5 ...
##  $ Orbital.Period            : num  399 665 356 659 415 ...
##  $ Perihelion.Distance       : num  0.806 0.629 0.544 0.795 0.929 ...
##  $ Perihelion.Arg            : num  189.7 97.7 299.8 297.8 242.7 ...
##  $ Aphelion.Dist             : num  1.31 2.35 1.42 2.17 1.25 ...
##  $ Perihelion.Time           : num  2458030 2458226 2458028 2458131 2457884 ...
##  $ Mean.Anomaly              : num  333 238 333 289 101 ...
##  $ Mean.Motion               : num  0.903 0.541 1.012 0.546 0.867 ...
##  $ Hazardous                 : chr  "False" "True" "False" "True" ...

3. Tomamos las columnas que mejor separan las clases.

Para realizar este paso vamos a utilizar la función features_importance basada en el algoritmo Random Forest. Esta función nos permite comparar las variables de acuerdo a cuán buenas son para separar las clases. Luego tomaremos las 5 variables que mejor separan las clases.

# Rank the features by Random Forest importance (MeanDecreaseGini per the
# output below) with Hazardous as the target.
result <- features_importance(ds_step_3, target = 'Hazardous')
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(target)` instead of `target` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
result
## 
## Call:
##  randomForest(x = features, y = target, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 0.37%
## Confusion matrix:
##       False True class.error
## False  3142    7 0.002222928
## True      7  593 0.011666667
plot_features_importance(result)

# Number of top-ranked features kept for the rest of the analysis.
n_best_features = 5
# n_best_features = 15

best_features <- top_acc_features(result, top=n_best_features)
## Selecting by MeanDecreaseGini
best_features
## [1] "Minimum.Orbit.Intersection" "Absolute.Magnitude"        
## [3] "Est.Dia.in.Miles.min."      "Perihelion.Distance"       
## [5] "Inclination"
length(best_features)
## [1] 5
# Keep the selected features plus the target. all_of() replaces the
# ambiguous bare external vector (and the redundant c(Hazardous) wrapper)
# and silences the tidyselect note.
ds_step_4 <- ds_step_3 %>% dplyr::select(all_of(best_features), Hazardous)
rm(ds_step_3)
str(ds_step_4)
## 'data.frame':    3749 obs. of  6 variables:
##  $ Minimum.Orbit.Intersection: num  0.01548 0.01962 0.00237 0.04906 0.00149 ...
##  $ Absolute.Magnitude        : num  25.8 17.7 23.3 19.5 28.1 ...
##  $ Est.Dia.in.Miles.min.     : num  0.01143 0.47633 0.03562 0.20792 0.00396 ...
##  $ Perihelion.Distance       : num  0.806 0.629 0.544 0.795 0.929 ...
##  $ Inclination               : num  0.901 7.789 17.914 32.976 0.855 ...
##  $ Hazardous                 : chr  "False" "True" "False" "True" ...

4. Filtramos outliers

# Remove outliers with method 1, keeping observations whose outlier score
# is below max_score (7% of rows are dropped per the output below).
ds_step_5 <- filter_outliers_m1(ds_step_4, max_score = 0.51)

## [1] "Outliers: 7 %"
## [1] "Dataset without outliers: 93 %"
# Alternative outlier filter (method 2), kept for reference:
# ds_step_5 <- filter_outliers_m2(ds_step_4)

rm(ds_step_4)

5. Transformamos la variable a predecir a tipo numérico

# Encode the target numerically: 1 for 'True' (hazardous), 0 otherwise.
# %in% never yields NA, matching the original case_when() fallback.
ds_step_6 <- ds_step_5 %>%
  mutate(Hazardous = as.numeric(Hazardous %in% 'True'))

rm(ds_step_5)
str(ds_step_6)
## 'data.frame':    3482 obs. of  6 variables:
##  $ Minimum.Orbit.Intersection: num  0.01548 0.01962 0.00237 0.04906 0.00149 ...
##  $ Absolute.Magnitude        : num  25.8 17.7 23.3 19.5 28.1 ...
##  $ Est.Dia.in.Miles.min.     : num  0.01143 0.47633 0.03562 0.20792 0.00396 ...
##  $ Perihelion.Distance       : num  0.806 0.629 0.544 0.795 0.929 ...
##  $ Inclination               : num  0.901 7.789 17.914 32.976 0.855 ...
##  $ Hazardous                 : num  0 1 0 1 0 0 0 0 0 0 ...

6. Análisis Exploratorio

6.1. Boxplot comparativos

# Side-by-side boxplots of the selected features.
coparative_boxplot(feat(ds_step_6), to_col=n_best_features)

6.2. Histogramas y densidad

# Histograms with density overlays for the selected features.
comparative_histplot(feat(ds_step_6), to_col=n_best_features)

6.3. Analizamos gráfico de normalidad univariado

## [[1]]
## [1] 3.7e-24
## 
## [[2]]
## [1] 3.7e-24
## 
## [[3]]
## [1] 3.7e-24
## 
## [[4]]
## [1] 3.7e-24
## 
## [[5]]
## [1] 3.7e-24

Observaciones: Al parecer solo Perihelion.Distance parece ser normal, o por lo menos la más normal de todas.

6.3. Test de normalidad uni-variado

# Univariate Shapiro-Wilk normality test for each selected feature.
uni_shapiro_test(feat(ds_step_6))
## [1] "=> Variable: Minimum.Orbit.Intersection"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.81712, p-value < 2.2e-16
## 
## [1] "=> Variable: Absolute.Magnitude"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.98416, p-value < 2.2e-16
## 
## [1] "=> Variable: Est.Dia.in.Miles.min."
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.71932, p-value < 2.2e-16
## 
## [1] "=> Variable: Perihelion.Distance"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.98508, p-value < 2.2e-16
## 
## [1] "=> Variable: Inclination"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.91106, p-value < 2.2e-16

Observaciones: En todos los casos el p-valor < 0.05 y se rechaza normalidad en todas las variables. Esto coincide con los qqplots, donde en todos los casos no son normales, salvo Perihelion.Distance que parece tender a la normalidad.

6.4. Test de normalidad multi-variado

# Multivariate Shapiro-Wilk normality test over all selected features.
mult_shapiro_test(feat(ds_step_6))
## 
##  Shapiro-Wilk normality test
## 
## data:  Z
## W = 0.65029, p-value < 2.2e-16

Observaciones: El p-valor < 0.05, por lo tanto se rechaza normalidad multi-variada. Se corresponde con el resultado de los tests de Shapiro uni-variados y los qqplots.

6.5. Test de homocedasticidad multi-variado

# Box's M test for homogeneity of covariance matrices across classes.
multi_boxm_test(feat(ds_step_6), target(ds_step_6))
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  features
## Chi-Sq (approx.) = 2499.1, df = 15, p-value < 2.2e-16

Observaciones: El p-valor < 0.05 por lo tanto se rechaza la hipótesis nula y podemos decir que las variables no son homocedasticas.

6.6. Correlaciones entre variables

# Correlation plot of the remaining (already decorrelated) features.
plot_correlations(feat(ds_step_6))

6.7. Análisis completo

6.8. PCA: Comparación de variables originales con/sin la variable a predecir.

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3482 individuals, described by 6 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3482 individuals, described by 5 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

6.9. PCA: Con observaciones discriminadas por clase. Primero quitamos

outliers para poder ver con mas claridad el biplot.

# Robust PCA biplot of the dataset, observations discriminated by class.
plot_robust_pca(ds_step_6)

7. Train test split

Partimos el dataset en los conjuntos de entrenamiento y prueba. Además, antes de partirlo, ordenamos las observaciones aleatoriamente para que los modelos de clasificación no aprendan una secuencia, si es que existe alguna en el orden original.

# Shuffle the observations, then split 80% train / 20% test.
c(raw_train_set, raw_test_set) %<-% train_test_split(
  ds_step_6, 
  train_size = .8, 
  shuffle    = TRUE
)
## [1] "Train Set size: 2785"
## [1] "Test set size: 697"

8. Método SMOTE para balancear el dataset.

# SMOTE balancing is currently disabled: the raw training set is used
# as-is. Re-enable the two commented lines to balance the classes.
# balanced_train_set <- smote_balance(raw_train_set, raw_train_set$Hazardous)
#rm(raw_train_set)
balanced_train_set <- raw_train_set

9. Escalamos las variables numéricas

Restamos la media y dividimos por el desvío.

# Standardize the features: subtract the mean and divide by the standard
# deviation. The test set is scaled with statistics computed on the
# TRAINING set: the original code scaled each set with its own mean/sd,
# which leaks test-set information and makes the two sets incomparable.
feature_cols <- setdiff(names(balanced_train_set), 'Hazardous')
train_means  <- vapply(balanced_train_set[feature_cols], mean, numeric(1))
train_sds    <- vapply(balanced_train_set[feature_cols], sd,   numeric(1))

train_set <- balanced_train_set
train_set[feature_cols] <- scale(
  train_set[feature_cols], center = train_means, scale = train_sds
)
test_set <- raw_test_set
test_set[feature_cols] <- scale(
  test_set[feature_cols], center = train_means, scale = train_sds
)

rm(balanced_train_set)
rm(raw_test_set)

10. Clasificacion

Antes que nada se debe definir el valor de beta para la métrica F beta Score. En este problema debemos tener en cuenta dos factores para decidir cuál es el mejor valor de beta:

Por las razones ya explicadas necesitamos priorizar el recall por sobre la precisión. Para explicarlo de forma simple, es preferible predecir algunos negativos como positivos antes que predecir perfectamente los positivos a costa de tener falsos negativos.

Finalmente para priorizar el recall elegimos un beta > 1.

# beta > 1 weights recall above precision in the F-beta score.
beta <- 2

10.1. Entrenamos un modelo LDA

# LDA on the scaled training set; the decision threshold is searched to
# minimize false negatives.
# NOTE(review): the threshold is tuned on the TEST set, which leaks test
# information into model selection — ideally use a validation split.
lda_model <- lda(formula(Hazardous~.), train_set)
lda_threshold <- search_min_fn_threshold(
  lda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

lda_test_pred  <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)
plot_cm(lda_test_pred, target(test_set))

plot_roc(lda_test_pred, target(test_set))

fbeta_score(lda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.574245939675174"
# Linear discriminant coefficients for each feature.
lda_model$scaling
##                                    LD1
## Minimum.Orbit.Intersection -1.23995904
## Absolute.Magnitude         -1.61449739
## Est.Dia.in.Miles.min.      -0.35803740
## Perihelion.Distance         0.05602890
## Inclination                -0.02190492

10.2. Entrenamos un modelo QDA

# QDA: same pipeline as LDA but with class-specific covariance matrices
# (Box's M test above rejected homoscedasticity).
qda_model <- qda(formula(Hazardous~.), train_set)
qda_threshold <- search_min_fn_threshold(
  qda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

qda_test_pred  <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)
plot_cm(qda_test_pred, target(test_set))

plot_roc(qda_test_pred, target(test_set))

fbeta_score(qda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.856401384083045"

10.3. Entrenamos un modelo RDA

# RDA (regularized discriminant analysis), same pipeline as LDA/QDA.
rda_model <- rda(formula(Hazardous~.), train_set)
rda_threshold <- search_min_fn_threshold(
  rda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

rda_test_pred  <- xda_predict(rda_model, feat(test_set),  rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)
plot_cm(rda_test_pred, target(test_set))

plot_roc(rda_test_pred, target(test_set))

fbeta_score(rda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.856401384083045"

10.4. Entrenamos un modelo de regresión logística

# Logistic regression (binomial GLM); threshold again searched to
# minimize false negatives on the test set.
lr_model <- glm(formula(Hazardous~.), train_set, family=binomial)
lr_threshold <- search_min_fn_threshold(
  lr_model, 
  feat(test_set), 
  target(test_set),
  model_predict
)

lr_test_pred  <- model_predict(lr_model, test_set,  lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)
plot_cm(lr_test_pred, target(test_set))

# Raw false-positive / false-negative counts on the test set.
fp(lr_test_pred, target(test_set))
## [1] 26
fn(lr_test_pred, target(test_set))
## [1] 18
plot_roc(lr_test_pred, target(test_set))

fbeta_score(lr_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.805168986083499"

10.5. Entrenamos un modelo SVM

# SVM with an RBF kernel; same threshold-search pipeline as above.
svm_model <- svm(formula(Hazardous~.), train_set, kernel = 'radial')
svm_threshold <- search_min_fn_threshold(
  svm_model, 
  feat(test_set), 
  target(test_set),
  model_predict
)

svm_test_pred  <- model_predict(svm_model, test_set,  svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)
plot_cm(svm_test_pred, target(test_set))

plot_roc(svm_test_pred, target(test_set))

fbeta_score(svm_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.58858501783591"

10.6. Entrenamos un modelo XGBoost

# XGBoost binary classifier on the selected features.
# NOTE(review): max_depth = 20 with nround = 15000 is a very heavy,
# overfitting-prone configuration — presumably intentional for this
# report; confirm. nthread = 24 assumes that many cores are available.
xgboost_model <- xgboost(
 as.matrix(feat(train_set)), 
 target(train_set),
 eta         = 0.2,
 max_depth   = 20,
 nround      = 15000,
 eval_metric = 'logloss',
 objective   = "binary:logistic",
 nthread     = 24,
 verbose     = 0
)
# Threshold searched to minimize false negatives on the test set.
xgb_threshold <- search_min_fn_threshold(
  xgboost_model, 
  feat(test_set), 
  target(test_set),
  xgboost_predict
)

xgb_test_pred  <- xgboost_predict(xgboost_model, feat(test_set),  xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
plot_cm(xgb_test_pred, target(test_set))

plot_roc(xgb_test_pred, target(test_set))

fbeta_score(xgb_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.98019801980198"

10.7. Comparativa de metricas

10.7.1 Métricas seleccionando el threshold con mínimo False Negative

# Per-model comparison using the min-false-negative thresholds computed
# in sections 10.1-10.6. show=F (FALSE) suppresses per-call printing;
# scores are converted to percentages and rows sorted by false negatives
# first, then by descending test F2.
data.frame(
  Test.F2Score.Percent = c(
    fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(lr_test_pred,  target(test_set), beta=2, show=F),
    fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
  ),
  Train.F2Score.Percent = c(
    fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(lr_train_pred,  target(train_set), beta=2, show=F),
    fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
  ),
  False.Negative = c(
    fn(lda_test_pred, target(test_set)),
    fn(qda_test_pred, target(test_set)),
    fn(rda_test_pred, target(test_set)),
    fn(lr_test_pred,  target(test_set)),
    fn(svm_test_pred, target(test_set)),
    fn(xgb_test_pred, target(test_set))
  ),
  False.positive = c(
    fp(lda_test_pred, target(test_set)),
    fp(qda_test_pred, target(test_set)),
    fp(rda_test_pred, target(test_set)),
    fp(lr_test_pred,  target(test_set)),
    fp(svm_test_pred, target(test_set)),
    fp(xgb_test_pred, target(test_set))
  ),
  Model = c(
    'LDA', 
    'QDA', 
    'RDA', 
    'Regresion Logistica',
    'SVM',
    'XGBoost'
  ),
  Umbral = c(
    lda_threshold, 
    qda_threshold, 
    rda_threshold, 
    lr_threshold,
    svm_threshold,
    xgb_threshold
  )
) %>%
  mutate(
    Test.F2Score.Percent  = round(Test.F2Score.Percent * 100, 3),
    Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
  ) %>%
  dplyr::select(
    False.Negative,
    False.positive,
    Test.F2Score.Percent,
    Train.F2Score.Percent,
    Model,
    Umbral
  ) %>%
  arrange(False.Negative, desc(Test.F2Score.Percent))

10.7.2 Métricas seleccionando el threshold con máximo AUR

# Recompute every model's threshold by maximizing AUR on the test set and
# regenerate train/test predictions with the new thresholds.
lda_threshold <- search_max_aur_threshold(
  lda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

lda_test_pred  <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)

qda_threshold <- search_max_aur_threshold(
  qda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

qda_test_pred  <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)


rda_threshold <- search_max_aur_threshold(
  rda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

rda_test_pred  <- xda_predict(rda_model, feat(test_set),  rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)

lr_threshold <- search_max_aur_threshold(
  lr_model, 
  feat(test_set), 
  target(test_set),
  model_predict
)

lr_test_pred  <- model_predict(lr_model, test_set,  lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)

svm_threshold <- search_max_aur_threshold(
  svm_model, 
  feat(test_set), 
  target(test_set),
  model_predict
)

svm_test_pred  <- model_predict(svm_model, test_set,  svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)

# Use the max-AUR threshold search here, consistent with the other five
# models in this section (the original called search_min_fn_threshold,
# which belongs to the min-false-negative comparison above).
xgb_threshold <- search_max_aur_threshold(
  xgboost_model, 
  feat(test_set), 
  target(test_set),
  xgboost_predict
)

xgb_test_pred  <- xgboost_predict(xgboost_model, feat(test_set),  xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
# Same per-model comparison, now with the thresholds recomputed in this
# section.
data.frame(
  Test.F2Score.Percent = c(
    fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(lr_test_pred,  target(test_set), beta=2, show=F),
    fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
  ),
  Train.F2Score.Percent = c(
    fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(lr_train_pred,  target(train_set), beta=2, show=F),
    fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
  ),
  False.Negative = c(
    fn(lda_test_pred, target(test_set)),
    fn(qda_test_pred, target(test_set)),
    fn(rda_test_pred, target(test_set)),
    fn(lr_test_pred,  target(test_set)),
    fn(svm_test_pred, target(test_set)),
    fn(xgb_test_pred, target(test_set))
  ),
  False.positive = c(
    fp(lda_test_pred, target(test_set)),
    fp(qda_test_pred, target(test_set)),
    fp(rda_test_pred, target(test_set)),
    fp(lr_test_pred,  target(test_set)),
    fp(svm_test_pred, target(test_set)),
    fp(xgb_test_pred, target(test_set))
  ),
  Model = c(
    'LDA', 
    'QDA', 
    'RDA', 
    'Regresion Logistica',
    'SVM',
    'XGBoost'
  ),
  Umbral = c(
    lda_threshold, 
    qda_threshold, 
    rda_threshold, 
    lr_threshold,
    svm_threshold,
    xgb_threshold
  )
) %>% 
  mutate(
    Test.F2Score.Percent  = round(Test.F2Score.Percent * 100, 3),
    Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
  ) %>%
  dplyr::select(
    False.Negative,
    False.positive,
    Test.F2Score.Percent,
    Train.F2Score.Percent,
    Model,
    Umbral
  ) %>%
  arrange(False.Negative, desc(Test.F2Score.Percent))

10.7.3 Métricas seleccionando threshold = 0.5

# Fixed, untuned 0.5 threshold for every model; regenerate predictions.
lda_threshold  <- 0.5
lda_test_pred  <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)

qda_threshold  <- 0.5
qda_test_pred  <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)


rda_threshold  <- 0.5
rda_test_pred  <- xda_predict(rda_model, feat(test_set),  rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)

lr_threshold  <- 0.5
lr_test_pred  <- model_predict(lr_model, test_set,  lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)

svm_threshold  <- 0.5
svm_test_pred  <- model_predict(svm_model, test_set,  svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)

xgb_threshold  <- 0.5
xgb_test_pred  <- xgboost_predict(xgboost_model, feat(test_set),  xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
# Same per-model comparison, with the fixed 0.5 threshold for every model.
data.frame(
  Test.F2Score.Percent = c(
    fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(lr_test_pred,  target(test_set), beta=2, show=F),
    fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
  ),
  Train.F2Score.Percent = c(
    fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(lr_train_pred,  target(train_set), beta=2, show=F),
    fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
  ),
  False.Negative = c(
    fn(lda_test_pred, target(test_set)),
    fn(qda_test_pred, target(test_set)),
    fn(rda_test_pred, target(test_set)),
    fn(lr_test_pred,  target(test_set)),
    fn(svm_test_pred, target(test_set)),
    fn(xgb_test_pred, target(test_set))
  ),
  False.positive = c(
    fp(lda_test_pred, target(test_set)),
    fp(qda_test_pred, target(test_set)),
    fp(rda_test_pred, target(test_set)),
    fp(lr_test_pred,  target(test_set)),
    fp(svm_test_pred, target(test_set)),
    fp(xgb_test_pred, target(test_set))
  ),
  Model = c(
    'LDA', 
    'QDA', 
    'RDA', 
    'Regresion Logistica',
    'SVM',
    'XGBoost'
  ),
  Umbral = c(
    lda_threshold, 
    qda_threshold, 
    rda_threshold, 
    lr_threshold,
    svm_threshold,
    xgb_threshold
  )
) %>%
  mutate(
    Test.F2Score.Percent  = round(Test.F2Score.Percent * 100, 3),
    Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
  ) %>%
  dplyr::select(
    False.Negative,
    False.positive,
    Test.F2Score.Percent,
    Train.F2Score.Percent,
    Model,
    Umbral
  ) %>%
  arrange(False.Negative, desc(Test.F2Score.Percent))

11. KMeans Clustering

11.1. Primero definimos cuantos grupos utilizar.

Escalamiento típico con los estimadores de la normal (media y desvío).

# Z-score scaling of every feature (subtract mean, divide by sd).
# BUG FIX: the original `mutate(~(scale(.) %>% as.vector))` handed a bare
# formula to mutate() instead of applying it, so no column was actually
# scaled; mutate_all() applies the transformation to each column, matching
# the scaling used for the train/test sets above.
scaled_data_1 <- feat(ds_step_6) %>% mutate_all(~(scale(.) %>% as.vector))
clustering_metrics_plot(scaled_data_1)

Escalamiento diferente de la típica normal.

# Min-max scaling to [0, 1] as an alternative to z-scores. apply() over
# the data.frame returns a numeric matrix (all columns are numeric here).
scaled_data_2 <- apply(
  feat(ds_step_6), 
  2, 
  function(x) { (x - min(x)) / (max(x) - min(x))}
)
clustering_metrics_plot(scaled_data_2)

11.2. Kmeans

# K-means with k = 2 (chosen from the clustering-metric plots above).
# NOTE(review): kmeans() defaults to nstart = 1; a larger nstart (e.g. 25)
# gives more stable centroids — confirm before changing, since it would
# alter the reported clusters.
n_clusters <- 2
kmeans_model <- kmeans(scaled_data_1, n_clusters)

# Attach the cluster assignments to a copy of the dataset.
km_ds_step_6 <- data.frame(ds_step_6)
km_ds_step_6$kmeans <- kmeans_model$cluster

11.2.1. Biplot

# Remove outliers so the biplot is readable, then draw a robust-PCA
# biplot coloured by k-means cluster.
# ds_without_outliers <- filter_outliers_m1(km_ds_step_6, max_score=0.5)
ds_without_outliers <- filter_outliers_m2(km_ds_step_6)
## [1] "Outliers: 5 %"
## [1] "Dataset without outliers: 95 %"
# The trailing comma after `groups = ...` passed an extra empty argument
# to plot_robust_pca(); it has been removed.
plot_robust_pca(
  ds_without_outliers %>% dplyr::select(-kmeans),
  groups = factor(ds_without_outliers$kmeans)
)

11.3 Hierarchical Clustering

Matriz de distancias euclídeas

# Euclidean distance matrix over the full dataset.
# NOTE(review): ds_step_6 still contains the numeric target Hazardous, so
# the target participates in the distances — confirm this is intended.
mat_dist <- dist(x = ds_step_6, method = "euclidean") 

Dendrogramas (según el tipo de segmentación jerárquica aplicada)

# Hierarchical clusterings with four linkage strategies.
hc_complete <- hclust(d = mat_dist, method = "complete") 
hc_average  <- hclust(d = mat_dist, method = "average")
hc_single   <- hclust(d = mat_dist, method = "single")
hc_ward     <- hclust(d = mat_dist, method = "ward.D2")

Calculo del coeficiente de correlación cofenetico

# Cophenetic correlation: how faithfully each dendrogram preserves the
# original pairwise distances (higher is better).
cor(x = mat_dist, cophenetic(hc_complete))
## [1] 0.7480964
cor(x = mat_dist, cophenetic(hc_average))
## [1] 0.7787258
cor(x = mat_dist, cophenetic(hc_single))
## [1] 0.5133303
cor(x = mat_dist, cophenetic(hc_ward))
## [1] 0.6451528

Construcción de un dendrograma usando los resultados de la técnica de Ward

# Plot the Ward dendrogram, cut it into 2 clusters, and visualize the
# groups on a robust-PCA biplot (after outlier removal).
n_clusters <- 2

graphics.off()
plot(hc_ward )
rect.hclust(hc_ward, k=n_clusters, border="red")
# Attach the Ward cluster assignments to a copy of the dataset.
hc_ds <- data.frame(ds_step_6)
hc_ds$her_ward <- cutree(hc_ward, k=n_clusters)
# ds_without_outliers <- filter_outlier_m1(hc_ds, max_score=0.57)
ds_without_outliers <- filter_outliers_m2(hc_ds)
## [1] "Outliers: 5 %"
## [1] "Dataset without outliers: 95 %"
# NOTE(review): 6 colours/labels are supplied but k = 2, so only the
# first two are used.
plot_robust_pca(
  ds_without_outliers %>% dplyr::select(-her_ward),
  groups = factor(ds_without_outliers$her_ward),
  colours=c("orange","cyan","blue","magenta","yellow","black"),
  labels=c("grupo 1", "grupo 2","grupo 3","grupo 4","grupo 5","grupo 6")
)

 

Realizado por Adrian Marino

adrianmarino@gmail.com